# load functions and libraries
# This directory should have the MotrpacRatTraining6moQCRep repository
# clone from: https://github.com/MoTrPAC/MotrpacRatTraining6moQCRep
repo_dir = '~/Desktop/repos'
source(paste0(repo_dir,"/tools/unsupervised_normalization_functions.R"))
source(paste0(repo_dir,"/tools/gcp_functions.R"))
source(paste0(repo_dir,"/tools/qc_report_aux_functions.R"))
source(paste0(repo_dir,"/tools/config_session.R"))
source(paste0(repo_dir,"/tools/association_analysis_methods.R"))
source(paste0(repo_dir,"/tools/pi1_cook_fx.R"))
source(paste0(repo_dir,"/tools/get_fx.R"))
# The system's path to the gsutil command
gsutil_cmd = "~/google-cloud-sdk/bin/gsutil"
# set a tmp dir for downloading the data
tmpdir = "ADDPATH"
# load counts from GCP buckets
# these paths were removed for security reasons, see the repo's readme
input_bucket = "ADDPATH"
output_bucket = "ADDPATH"
# Specify bucket and local path for the phenotypic data
pheno_bucket = "ADDPATH"
# specify parameters for filtering lowly expressed peaks and normalization
# keep peaks with at least 10 counts in 4 samples
min_counts = 10
present_in_n_samples = 4
#norm_method = "quantile"
# specify how many PCs to examine
num_features = 5000
num_pcs = 5
num_pcs_for_outlier_analysis = 3
#min_pc_ve = 0.05
# Specify the significance level for association analysis
# (e.g., between a PC and a clinical variable)
p_thr = 0.0001
Specifiy the pipeline, metadata, and sample variables for the analysis.
# Define technical and biological variables to be considered in the QC
pipeline_qc_cols = c("Sample_batch", # batch of samples manually processed together
"Lib_DNA_conc", # library concentration before pooling for sequencing
"Nuclei_tagmentation", # at the moment, sites report this differently
"peak_enrich.frac_reads_in_peaks.macs2.frip",
"replication.num_peaks.num_peaks",
"align.samstat.pct_mapped_reads",
"align.dup.pct_duplicate_reads",
"pct_chrM",
"total_primary_alignments", # similar to number of raw reads
"align.nodup_samstat.total_reads", # number of paired-end reads that go into peak calling
"align_enrich.tss_enrich.tss_enrich",
"align.frag_len_stat.mono_nuc_peak_exists")
biospec_cols = c("registration.sex","key.sacrificetime",
"animal.key.is_control",
"terminal.weight.bw",
"bid","pid","group","viallabel")
# load merged experimental and pipeline meta
# change path for internal release
qa_qc = system(sprintf("gsutil ls %s*qa-qc*", input_bucket), intern=T)
atacseq_meta = dl_read_gcp(qa_qc, sep=',', tmpdir=tmpdir) # dl_read_gcp() from get_fx.R
atacseq_meta[,viallabel := as.character(viallabel)]
# load pheno -------------------------------------------------------------------------
# Metadata from DMAQC: use the merged data frame
# These paths were removed, see the repo's readme
dmaqc_metadata = 'ADDPATH'
dmaqc_dict = 'ADDPATH'
# download and format phenotypic data
dmaqc_metadata = dl_read_gcp(dmaqc_metadata, sep='\t')
cols = dl_read_gcp(dmaqc_dict, sep='\t')
old_cols = colnames(dmaqc_metadata)
new_cols = tolower(cols[match(old_cols, BICUniqueID), FullName])
colnames(dmaqc_metadata) = new_cols # this isn't perfect, but we don't care about the columns it doesn't work for for now
# make some variables human-readable
# create new variables "protocol", "agegroup", "intervention", "sacrificetime", "sex" with readable strings
for (var in c('key.protocol','key.agegroup','key.intervention','key.sacrificetime','registration.sex')){
d = cols[Field.Name == gsub('.*\\.','',var)]
keys=unname(unlist(strsplit(d[,Categorical.Values],'\\|')))
values=tolower(unname(unlist(strsplit(d[,Categorical.Definitions],'\\|'))))
names(values) = keys
# match keys to values; create new column
new_var = gsub(".*\\.","",var)
dmaqc_metadata[,(new_var) := unname(values)[match(get(var), names(values))]]
}
# clean up "sacrificetime"
dmaqc_metadata[,sacrificetime := sapply(sacrificetime, function(x) gsub(' week.*','W',x))]
# clean up 'intervention'
dmaqc_metadata[grepl('training',intervention), intervention := 'training']
# make "group" - "1W", "2W", "4W", "8W", "SED"
dmaqc_metadata[,group := sacrificetime]
dmaqc_metadata[intervention == 'control', group := 'SED']
# make tech ID a string
dmaqc_metadata[,tissue := specimen.processing.sampletypedescription]
dmaqc_metadata[,viallabel := as.character(viallabel)]
# merge -------------------------------------------------------------------------
# merge qc
atacseq_meta = merge(dmaqc_metadata, atacseq_meta, by='viallabel')
# convert to data.frame for compatibility with other QC report code
atacseq_meta = as.data.frame(atacseq_meta, stringsAsFactors=F)
rownames(atacseq_meta) = atacseq_meta$viallabel
atacseq_meta$animal.key.is_control = atacseq_meta$key.intervention == 3
atacseq_data = list()
atacseq_data$data_tables = list()
atacseq_data$pheno = list()
for(file in system(sprintf("gsutil ls %s*atac.counts*",input_bucket), intern=T)){
tissue_code = gsub("\\..*","",basename(file))
tissue = bic_animal_tissue_code$tissue_name_release[bic_animal_tissue_code$bic_tissue_code==tissue_code]
counts = dl_read_gcp(file, sep='\t', tmpdir=tmpdir) # data.table
peaks = counts[,.(chrom, start, end)]
counts = data.frame(counts, check.names=F)
# name feature
rownames(counts) = paste0(counts$chrom, ':', counts$start, '-', counts$end)
counts[,c("chrom","start","end")] = NULL
# get metadata
curr_meta = atacseq_meta[atacseq_meta$viallabel %in% colnames(counts),]
# remove ref std
counts = counts[,curr_meta$viallabel]
for(get_site in unique(curr_meta$GET_site)){
vl = curr_meta$viallabel[curr_meta$GET_site == get_site]
site_meta = curr_meta[curr_meta$viallabel %in% vl,]
site_counts = counts[,vl]
dataset = sprintf("%s,%s", tolower(get_site), tissue)
atacseq_data$data_tables[[dataset]] = site_counts
atacseq_data$pheno[[dataset]] = site_meta
}
}
# the same peaks are used for all tissues
widths = peaks[,end] - peaks[,start]
hist(widths, main="Peak widths", breaks=500)
save(atacseq_data, file="atacseq_data-20210510.RData")
Each raw counts table has the same peaks. Write the feature-mapping file and look at distribution of peak annotations.
# make TxDb object
txdb = makeTxDbFromEnsembl(organism="Rattus norvegicus",
release=96)
# annotate peaks
counts = peaks
atac_peaks = counts[,.(chrom, start, end)]
atac_peaks = atac_peaks[grepl("^chr", chrom)] # remove contigs
atac_peaks[,chrom := gsub("chr","",chrom)]
peak = GRanges(seqnames = atac_peaks[,chrom],
ranges = IRanges(as.numeric(atac_peaks[,start]), as.numeric(atac_peaks[,end])))
peakAnno = annotatePeak(peak,
level = "gene",
tssRegion=c(-1000,1000),
TxDb=txdb)
## >> preparing features information... 2021-05-10 05:06:44 PM
## >> identifying nearest features... 2021-05-10 05:06:45 PM
## >> calculating distance from peak to TSS... 2021-05-10 05:07:21 PM
## >> assigning genomic annotation... 2021-05-10 05:07:21 PM
## >> assigning chromosome lengths 2021-05-10 05:07:41 PM
## >> done... 2021-05-10 05:07:41 PM
pa = as.data.table(peakAnno@anno)
# only save peaks with a relationship to a gene
pa[,short_annotation := annotation]
pa[grepl('Exon', short_annotation), short_annotation := 'Exon']
pa[grepl('Intron', short_annotation), short_annotation := 'Intron']
cols=c('seqnames','start','end')
pa[,(cols) := lapply(.SD, as.character), .SDcols=cols]
cols=c('chrom','start','end')
atac_peaks[,(cols) := lapply(.SD, as.character), .SDcols=cols]
pa = merge(pa, atac_peaks, by.x=c('seqnames','start','end'), by.y=c('chrom','start','end'), all=T)
pa[,c('geneChr','strand') := NULL]
pa[,feature_ID := paste0("chr",seqnames,":",start,"-",end)]
# add Ensembl gene names
# Get this file from ext_data and unzip it
gtf = rtracklayer::import(paste0(repo_dir,'genome.gtf'))
gtf = as.data.table(gtf)
gtf = gtf[type=='gene',.(gene_id, gene_name, gene_biotype, protein_id)]
# merge with Ensembl
pa = merge(pa, gtf, by.x='geneId', by.y='gene_id', all.x=T, suffixes=c('_atac','_rna'))
pa[,assay := 'epigen-atac-seq']
setnames(pa, c("geneId","gene_name"), c("ensembl_gene","gene_symbol"))
setcolorder(pa, c("assay","feature_ID","ensembl_gene","gene_symbol"))
# write to MAWG bucket
write_to_bucket(pa, "pass1b-06_epigen-atac-seq_feature-mapping_20210510.txt", "ADDPATH")
# plot
pa_table2 = data.table(table(pa[,short_annotation]))
pa_table2 = pa_table2[order(N, decreasing=T)]
ggplot(pa, aes(x=short_annotation, fill=short_annotation)) +
geom_bar() +
theme_classic() +
scale_fill_brewer(palette = "Set1") +
labs(y='N annotated peaks (log10 scale)', fill='Peak annotation') +
theme(axis.text.x = element_text(angle = 90, hjust=1, colour='black', vjust=0.5),
axis.title.x = element_blank()) +
scale_y_continuous(trans="log10", breaks=c(1, 10, 100, 1000, 1e4, 1e5, 1e6)) +
scale_x_discrete(limits = pa_table2[,V1])
pa_table = data.table(table(pa[,short_annotation], pa[,seqnames]))
ggplot(pa_table, aes(x=V2, y=N, fill=V1)) +
geom_bar(stat='identity', position='stack') +
theme_classic() +
scale_fill_brewer(palette = "Set1") +
labs(y='N annotated peaks', fill='Peak annotation', x='Chromosome') +
scale_x_discrete(limits = c(as.character(seq(1,20)), 'X', 'Y'))
The ATAC-seq pipeline manual of procedures (MOP) defines a flagged sample (e.g. potentially problematic) as a sample that satisfies one of the following:
(1) Number of filtered, non-duplicated, non-mitochondrial paired-end reads \(<20M\), (2) Alignment rate \(<80%\), (3) Fraction of reads in sample-level MACS2 peaks \(<0.1\), (4) A nucleosome-free region is not present, (5) Neither a mononucleosome peak nor a dinucleosome peak is present, (6) TSS enrichment < 4
atacseq_meta$pipeline_flags = atacseq_pipeline_flagged_samples(atacseq_meta)
mop_flagged_samples = rownames(atacseq_meta)[nchar(atacseq_meta$pipeline_flags)>0]
We first remove peaks with low read counts across samples. Then we use limma voom to perform quantile normalizaition. For each dataset (i.e., tissue) we store the normalized data table and all additional variables (e.g., clinical data) in a list called norm_atacseq_data. ATAC-seq specific: we show the distribution of peak widths and genomic annotations among the filtered peaks in each tissue.
norm_atacseq_data = list()
for(dataset in names(atacseq_data$data_tables)){
unnorm_counts = atacseq_data$data_tables[[dataset]]
curr_samples = colnames(unnorm_counts)
# remove non-auto peaks
counts = unnorm_counts[grepl("^chr[0-9]|^chrY|^chrX", rownames(unnorm_counts)),]
# exclude low count peaks in the current dataset
# at least min_counts counts in present_in_n_samples samples
filt_counts = counts[rowSums(data.frame(lapply(counts, function(x) as.numeric(x >= min_counts)), check.names=F)) >= present_in_n_samples,]
# quantile normalize
# this takes a couple of minutes given the size of the peak x sample counts matrix
voom_obj = voom(filt_counts,normalize.method = "quantile")
norm_counts = round(voom_obj$E,2)
# get the dataset metadata
curr_meta1 = atacseq_data$pheno[[dataset]][curr_samples,pipeline_qc_cols]
curr_meta2 = atacseq_data$pheno[[dataset]][curr_samples,biospec_cols]
norm_atacseq_data[[dataset]] = list(
norm_counts = norm_counts,
filt_counts = filt_counts,
raw_counts = unnorm_counts,
pipeline_meta = curr_meta1,
dmaqc_meta = curr_meta2)
}
save(norm_atacseq_data, file="norm_atacseq_data-0-20210510.RData")
Examine the effect of the normalization process. For each dataset show the boxplots of 20 randomly selected samples.
for(i in 1:length(norm_atacseq_data)){
unnorm_data = norm_atacseq_data[[i]][["raw_counts"]]
norm_data = norm_atacseq_data[[i]][["norm_counts"]]
unnorm_data = log2(unnorm_data+1)
samp = sample(1:ncol(norm_data))[1:20]
curr_name = names(norm_atacseq_data)[i]
colnames(unnorm_data) = NULL
colnames(norm_data) = NULL
boxplot(unnorm_data[,samp],
main=paste0(curr_name,"\nlog2 raw counts (",nrow(unnorm_counts)," peaks)"),
ylab = "log2 raw counts",xaxt="n")
boxplot(norm_data[,samp],
main=paste0(curr_name,"\nquantile norm (",nrow(norm_data)," peaks)"),
ylab = "quantile normalized",xaxt="n")
}
We plot the top two PCs for each tissue, color and shape correspond to randomization group and sex.
for(currname in names(norm_atacseq_data)){
# run PCA
curr_data = norm_atacseq_data[[currname]][["norm_counts"]]
# keep N features with highest CVs
cv = apply(curr_data, 1, function(x) (sd(x)/mean(x))*100)
cv = cv[order(cv, decreasing=T)]
pca_data = curr_data[names(cv)[1:num_features],]
curr_pca = prcomp(t(pca_data),scale. = T,center = T)
curr_pcax = curr_pca$x[,1:num_pcs]
explained_var = summary(curr_pca)[["importance"]][3,5]
norm_atacseq_data[[currname]][["pca"]] = list(
pcax = curr_pcax,explained_var=explained_var
)
# plot
df = data.frame(
PC1 = curr_pcax[,1],PC2 = curr_pcax[,2],
group = norm_atacseq_data[[currname]][["dmaqc_meta"]][,"group"],
sex = as.character(norm_atacseq_data[[currname]][["dmaqc_meta"]][,"registration.sex"]),
stringsAsFactors = F
)
df$sex[df$sex=="1"] = "F"
df$sex[df$sex=="2"] = "M"
df$group = factor(df$group, levels = c("SED",
"1W",
"2W",
"4W",
"8W"))
p = ggplot(df) +
geom_point(aes(x=PC1, y=PC2,fill=group, shape=sex), colour="black", size=2) +
ggtitle(currname) +
theme_classic() +
scale_fill_manual(values=c(MotrpacBicQC::group_cols)) +
scale_shape_manual(values=c('F'=21, 'M'=24)) +
labs(x=sprintf("PC1 (%s%%)", signif(summary(curr_pca)[["importance"]][2,1]*100, digits=3)),
y=sprintf("PC2 (%s%%)", signif(summary(curr_pca)[["importance"]][2,2]*100, digits=3))) +
guides(fill=guide_legend(override.aes = list(shape=21)))
plot(p)
}
save(norm_atacseq_data, file="norm_atacseq_data-1-20210510.RData")
Here we analyze the top principal components (5) in each tissue and compute their association with the selected variables above (e.g., the pipeline qc metrics). We use Spearman correlation and a linear test for significance.
For each dataset we also add the correlations among the metadata variables. These two analyses should be interpreted together, as the analyzed variables are not independent.
pcs_vs_qc_var_report = c()
metadata_variable_assoc_report = c()
for(currname in names(norm_atacseq_data)){
curr_meta = cbind(norm_atacseq_data[[currname]]$pipeline_meta,
norm_atacseq_data[[currname]]$dmaqc_meta)
# remove the text group description
curr_meta = curr_meta[!grepl("randgr",names(curr_meta))]
# remove metadata variables with NAs
curr_meta = curr_meta[,!apply(is.na(curr_meta),2,any)]
# remove bid and pid
curr_meta = curr_meta[,!grepl("pid|bid|viallabel|group",names(curr_meta))]
# remove 0-variance meta
to_remove = colnames(curr_meta)[lapply(curr_meta, stats::var) == 0]
curr_meta[,to_remove] = NULL
# take the PCA results
curr_pcax = norm_atacseq_data[[currname]][["pca"]][["pcax"]]
corrs = cor(curr_pcax,curr_meta,method="spearman")
corrsp = pairwise_eval(
curr_pcax,curr_meta,func=pairwise_association_wrapper,
f=1)
# Some ggplots have to be printed to be shown in the notebook
print(ggcorrplot(t(corrs),lab=T,lab_size=2.5,hc.order = F) +
ggtitle(currname) +
theme(plot.title = element_text(hjust = 0.5,size=20)))
for(i in 1:nrow(corrsp)){
for(j in 1:ncol(corrsp)){
if(corrsp[i,j]>p_thr){next}
pcs_vs_qc_var_report = rbind(pcs_vs_qc_var_report,
c(currname,
rownames(corrsp)[i],colnames(corrsp)[j],corrs[i,j],corrsp[i,j])
)
}
}
# compute correlations among the metadata variables
corrs = cor(curr_meta,method="spearman")
corrsp = pairwise_eval(
curr_meta,func=pairwise_association_wrapper,
f=1)
# Some ggplots have to be printed to be shown in the notebook
print(ggcorrplot(corrs,lab=T,lab_size=1.5,hc.order=T))
for(n1 in rownames(corrsp)){
for(n2 in rownames(corrsp)){
if(n1==n2){break}
if(n1 %in% biospec_cols &&
n2 %in% biospec_cols) {next}
if(corrsp[n1,n2]>p_thr){next}
metadata_variable_assoc_report = rbind(metadata_variable_assoc_report,
c(currname,n1,n2,corrs[n1,n2],corrsp[n1,n2])
)
}
}
}
# Format the reports - for a nicer presentation in a table
colnames(metadata_variable_assoc_report) = c(
"Dataset(tissue,site)","Var1","Var2","rho(spearman)","p-value")
colnames(pcs_vs_qc_var_report) = c("Dataset(tissue,site)","PC",
"qc_metric","rho(spearman)","p-value")
pcs_vs_qc_var_report[,5] = format(
as.numeric(pcs_vs_qc_var_report[,5]),digits=3)
pcs_vs_qc_var_report[,4] = format(
as.numeric(pcs_vs_qc_var_report[,4]),digits=3)
metadata_variable_assoc_report[,5] = format(
as.numeric(metadata_variable_assoc_report[,5]),digits=3)
metadata_variable_assoc_report[,4] = format(
as.numeric(metadata_variable_assoc_report[,4]),digits=3)
kable(pcs_vs_qc_var_report,longtable=T,
caption = "Correlations between PCs and other variables") %>%
kable_styling(latex_options = c("repeat_header"),font_size = 8)
| Dataset(tissue,site) | PC | qc_metric | rho(spearman) | p-value |
|---|---|---|---|---|
| stanford,t52-hippocampus | PC1 | peak_enrich.frac_reads_in_peaks.macs2.frip | 0.8675 | 5.60e-15 |
| stanford,t52-hippocampus | PC1 | align_enrich.tss_enrich.tss_enrich | 0.7611 | 2.40e-14 |
| mssm,t55-gastrocnemius | PC1 | peak_enrich.frac_reads_in_peaks.macs2.frip | 0.8645 | 3.34e-20 |
| mssm,t55-gastrocnemius | PC1 | replication.num_peaks.num_peaks | -0.8111 | 2.53e-06 |
| mssm,t55-gastrocnemius | PC1 | align_enrich.tss_enrich.tss_enrich | 0.5997 | 1.39e-16 |
| mssm,t55-gastrocnemius | PC3 | Sample_batch | -0.7693 | 1.21e-09 |
| mssm,t55-gastrocnemius | PC3 | align.samstat.pct_mapped_reads | 0.1169 | 1.79e-05 |
| mssm,t55-gastrocnemius | PC3 | pct_chrM | 0.1947 | 1.49e-08 |
| mssm,t55-gastrocnemius | PC3 | align.frag_len_stat.mono_nuc_peak_exists | -0.3998 | 8.09e-07 |
| mssm,t55-gastrocnemius | PC3 | key.sacrificetime | 0.7450 | 1.72e-06 |
| mssm,t55-gastrocnemius | PC4 | registration.sex | 0.7359 | 1.47e-05 |
| stanford,t55-gastrocnemius | PC1 | peak_enrich.frac_reads_in_peaks.macs2.frip | -0.9406 | 2.50e-20 |
| stanford,t55-gastrocnemius | PC1 | replication.num_peaks.num_peaks | 0.7543 | 8.89e-08 |
| stanford,t55-gastrocnemius | PC1 | align_enrich.tss_enrich.tss_enrich | -0.9014 | 2.24e-18 |
| stanford,t58-heart | PC1 | peak_enrich.frac_reads_in_peaks.macs2.frip | 0.8039 | 5.60e-12 |
| stanford,t58-heart | PC1 | align_enrich.tss_enrich.tss_enrich | 0.8496 | 3.73e-17 |
| stanford,t58-heart | PC2 | replication.num_peaks.num_peaks | 0.2123 | 3.33e-07 |
| stanford,t58-heart | PC2 | total_primary_alignments | 0.2087 | 4.70e-08 |
| stanford,t58-heart | PC2 | align.nodup_samstat.total_reads | 0.2637 | 2.54e-06 |
| stanford,t58-heart | PC5 | align.frag_len_stat.mono_nuc_peak_exists | 0.2425 | 8.36e-11 |
| stanford,t59-kidney | PC1 | peak_enrich.frac_reads_in_peaks.macs2.frip | -0.9021 | 2.62e-24 |
| stanford,t59-kidney | PC1 | align.dup.pct_duplicate_reads | -0.6464 | 8.50e-09 |
| stanford,t59-kidney | PC1 | pct_chrM | -0.6166 | 1.71e-09 |
| stanford,t59-kidney | PC1 | align_enrich.tss_enrich.tss_enrich | -0.5635 | 6.03e-11 |
| stanford,t59-kidney | PC1 | registration.sex | -0.6777 | 8.26e-06 |
| stanford,t59-kidney | PC1 | terminal.weight.bw | -0.6555 | 1.64e-06 |
| stanford,t59-kidney | PC2 | align_enrich.tss_enrich.tss_enrich | -0.5319 | 5.30e-05 |
| stanford,t59-kidney | PC2 | registration.sex | 0.7553 | 4.88e-11 |
| stanford,t59-kidney | PC2 | terminal.weight.bw | 0.5775 | 3.34e-08 |
| stanford,t66-lung | PC1 | peak_enrich.frac_reads_in_peaks.macs2.frip | 0.9733 | 7.55e-28 |
| stanford,t66-lung | PC1 | align.dup.pct_duplicate_reads | 0.6357 | 4.73e-05 |
| stanford,t66-lung | PC1 | pct_chrM | 0.8193 | 1.45e-13 |
| stanford,t66-lung | PC1 | total_primary_alignments | -0.5204 | 3.51e-06 |
| stanford,t66-lung | PC1 | align.nodup_samstat.total_reads | -0.4843 | 1.33e-05 |
| stanford,t66-lung | PC1 | align_enrich.tss_enrich.tss_enrich | 0.8921 | 5.23e-25 |
| stanford,t66-lung | PC1 | align.frag_len_stat.mono_nuc_peak_exists | 0.5732 | 1.49e-09 |
| stanford,t66-lung | PC2 | total_primary_alignments | 0.3480 | 8.33e-05 |
| stanford,t66-lung | PC2 | align.nodup_samstat.total_reads | 0.3779 | 2.89e-05 |
| stanford,t68-liver | PC1 | pct_chrM | 0.4782 | 3.05e-05 |
| stanford,t68-liver | PC1 | registration.sex | -0.8662 | 8.06e-28 |
| stanford,t68-liver | PC1 | terminal.weight.bw | -0.7771 | 5.64e-22 |
| stanford,t68-liver | PC2 | align.dup.pct_duplicate_reads | 0.5125 | 2.22e-06 |
| stanford,t68-liver | PC2 | align_enrich.tss_enrich.tss_enrich | 0.1139 | 5.95e-05 |
| stanford,t68-liver | PC3 | peak_enrich.frac_reads_in_peaks.macs2.frip | -0.6246 | 5.80e-07 |
| stanford,t69-brown-adipose | PC1 | peak_enrich.frac_reads_in_peaks.macs2.frip | -0.7860 | 5.32e-08 |
| stanford,t69-brown-adipose | PC1 | align_enrich.tss_enrich.tss_enrich | -0.6562 | 1.58e-05 |
| stanford,t69-brown-adipose | PC4 | total_primary_alignments | 0.4543 | 3.14e-06 |
| stanford,t69-brown-adipose | PC4 | align.nodup_samstat.total_reads | 0.3225 | 1.20e-05 |
| mssm,t70-white-adipose | PC1 | Sample_batch | 0.4880 | 4.99e-08 |
| mssm,t70-white-adipose | PC1 | peak_enrich.frac_reads_in_peaks.macs2.frip | 0.8824 | 7.16e-19 |
| mssm,t70-white-adipose | PC1 | align.dup.pct_duplicate_reads | 0.6217 | 1.77e-11 |
| mssm,t70-white-adipose | PC1 | pct_chrM | 0.7242 | 3.12e-15 |
| mssm,t70-white-adipose | PC1 | align_enrich.tss_enrich.tss_enrich | 0.9527 | 3.19e-27 |
| mssm,t70-white-adipose | PC1 | registration.sex | 0.5724 | 2.66e-06 |
| mssm,t70-white-adipose | PC1 | terminal.weight.bw | 0.3814 | 2.45e-05 |
| mssm,t70-white-adipose | PC3 | Sample_batch | 0.1303 | 4.06e-05 |
| mssm,t70-white-adipose | PC3 | registration.sex | -0.5336 | 3.54e-05 |
| mssm,t70-white-adipose | PC3 | terminal.weight.bw | -0.4644 | 5.33e-05 |
| stanford,t70-white-adipose | PC1 | replication.num_peaks.num_peaks | -0.3395 | 3.39e-06 |
| stanford,t70-white-adipose | PC1 | total_primary_alignments | -0.5767 | 2.59e-09 |
| stanford,t70-white-adipose | PC1 | align.nodup_samstat.total_reads | -0.5937 | 4.11e-09 |
| stanford,t70-white-adipose | PC2 | peak_enrich.frac_reads_in_peaks.macs2.frip | 0.5753 | 3.62e-08 |
| stanford,t70-white-adipose | PC2 | pct_chrM | 0.6045 | 9.78e-05 |
| stanford,t70-white-adipose | PC2 | align_enrich.tss_enrich.tss_enrich | 0.8338 | 1.83e-12 |
| stanford,t70-white-adipose | PC2 | registration.sex | 0.7415 | 5.71e-08 |
| stanford,t70-white-adipose | PC2 | terminal.weight.bw | 0.7291 | 2.11e-08 |
| stanford,t70-white-adipose | PC3 | replication.num_peaks.num_peaks | 0.0832 | 3.28e-08 |
| stanford,t70-white-adipose | PC4 | replication.num_peaks.num_peaks | -0.0075 | 1.22e-14 |
kable(metadata_variable_assoc_report,longtable=T,
caption = "Correlations between metadata variables")%>%
kable_styling(latex_options = c("repeat_header"),font_size = 8)
| Dataset(tissue,site) | Var1 | Var2 | rho(spearman) | p-value |
|---|---|---|---|---|
| stanford,t52-hippocampus | pct_chrM | align.dup.pct_duplicate_reads | 0.961 | 4.89e-29 |
| stanford,t52-hippocampus | total_primary_alignments | replication.num_peaks.num_peaks | 0.659 | 2.49e-05 |
| stanford,t52-hippocampus | align.nodup_samstat.total_reads | replication.num_peaks.num_peaks | 0.591 | 2.44e-05 |
| stanford,t52-hippocampus | align.nodup_samstat.total_reads | total_primary_alignments | 0.868 | 6.44e-17 |
| stanford,t52-hippocampus | align_enrich.tss_enrich.tss_enrich | peak_enrich.frac_reads_in_peaks.macs2.frip | 0.927 | 1.74e-29 |
| mssm,t55-gastrocnemius | replication.num_peaks.num_peaks | peak_enrich.frac_reads_in_peaks.macs2.frip | -0.715 | 9.83e-09 |
| mssm,t55-gastrocnemius | align.dup.pct_duplicate_reads | Lib_DNA_conc | -0.708 | 2.40e-06 |
| mssm,t55-gastrocnemius | align.dup.pct_duplicate_reads | align.samstat.pct_mapped_reads | -0.362 | 1.15e-05 |
| mssm,t55-gastrocnemius | pct_chrM | align.samstat.pct_mapped_reads | -0.357 | 7.84e-08 |
| mssm,t55-gastrocnemius | align.nodup_samstat.total_reads | total_primary_alignments | 0.946 | 2.19e-18 |
| mssm,t55-gastrocnemius | align_enrich.tss_enrich.tss_enrich | peak_enrich.frac_reads_in_peaks.macs2.frip | 0.850 | 7.81e-23 |
| mssm,t55-gastrocnemius | align_enrich.tss_enrich.tss_enrich | replication.num_peaks.num_peaks | -0.622 | 5.31e-06 |
| mssm,t55-gastrocnemius | align.frag_len_stat.mono_nuc_peak_exists | align.samstat.pct_mapped_reads | 0.394 | 4.77e-06 |
| mssm,t55-gastrocnemius | align.frag_len_stat.mono_nuc_peak_exists | pct_chrM | -0.411 | 1.88e-28 |
| mssm,t55-gastrocnemius | key.sacrificetime | Sample_batch | -0.852 | 2.03e-16 |
| stanford,t55-gastrocnemius | align.dup.pct_duplicate_reads | Lib_DNA_conc | -0.672 | 2.02e-07 |
| stanford,t55-gastrocnemius | align.nodup_samstat.total_reads | total_primary_alignments | 0.974 | 8.41e-40 |
| stanford,t55-gastrocnemius | align_enrich.tss_enrich.tss_enrich | peak_enrich.frac_reads_in_peaks.macs2.frip | 0.977 | 3.36e-33 |
| stanford,t55-gastrocnemius | registration.sex | align.dup.pct_duplicate_reads | 0.547 | 7.58e-05 |
| stanford,t58-heart | align.dup.pct_duplicate_reads | Lib_DNA_conc | -0.706 | 2.95e-08 |
| stanford,t58-heart | pct_chrM | align.dup.pct_duplicate_reads | 0.824 | 7.53e-11 |
| stanford,t58-heart | align.nodup_samstat.total_reads | Lib_DNA_conc | 0.638 | 3.52e-06 |
| stanford,t58-heart | align.nodup_samstat.total_reads | pct_chrM | -0.512 | 7.59e-06 |
| stanford,t58-heart | align.nodup_samstat.total_reads | total_primary_alignments | 0.807 | 3.60e-16 |
| stanford,t58-heart | align_enrich.tss_enrich.tss_enrich | peak_enrich.frac_reads_in_peaks.macs2.frip | 0.926 | 2.97e-22 |
| stanford,t59-kidney | align.dup.pct_duplicate_reads | peak_enrich.frac_reads_in_peaks.macs2.frip | 0.547 | 3.07e-07 |
| stanford,t59-kidney | pct_chrM | peak_enrich.frac_reads_in_peaks.macs2.frip | 0.510 | 1.53e-07 |
| stanford,t59-kidney | pct_chrM | align.dup.pct_duplicate_reads | 0.928 | 3.06e-27 |
| stanford,t59-kidney | align.nodup_samstat.total_reads | total_primary_alignments | 0.949 | 4.36e-28 |
| stanford,t59-kidney | align_enrich.tss_enrich.tss_enrich | peak_enrich.frac_reads_in_peaks.macs2.frip | 0.812 | 5.34e-19 |
| stanford,t59-kidney | align_enrich.tss_enrich.tss_enrich | replication.num_peaks.num_peaks | -0.457 | 1.27e-05 |
| stanford,t59-kidney | align_enrich.tss_enrich.tss_enrich | pct_chrM | 0.326 | 4.48e-05 |
| stanford,t66-lung | Lib_DNA_conc | Sample_batch | -0.632 | 2.05e-06 |
| stanford,t66-lung | pct_chrM | peak_enrich.frac_reads_in_peaks.macs2.frip | 0.717 | 3.51e-10 |
| stanford,t66-lung | pct_chrM | replication.num_peaks.num_peaks | -0.704 | 1.54e-06 |
| stanford,t66-lung | pct_chrM | align.dup.pct_duplicate_reads | 0.725 | 2.64e-07 |
| stanford,t66-lung | total_primary_alignments | peak_enrich.frac_reads_in_peaks.macs2.frip | -0.477 | 6.36e-05 |
| stanford,t66-lung | align.nodup_samstat.total_reads | total_primary_alignments | 0.993 | 1.01e-52 |
| stanford,t66-lung | align_enrich.tss_enrich.tss_enrich | peak_enrich.frac_reads_in_peaks.macs2.frip | 0.951 | 5.37e-30 |
| stanford,t66-lung | align_enrich.tss_enrich.tss_enrich | pct_chrM | 0.582 | 1.06e-07 |
| stanford,t66-lung | align_enrich.tss_enrich.tss_enrich | total_primary_alignments | -0.372 | 9.31e-05 |
| stanford,t66-lung | align.frag_len_stat.mono_nuc_peak_exists | peak_enrich.frac_reads_in_peaks.macs2.frip | 0.577 | 3.79e-07 |
| stanford,t66-lung | align.frag_len_stat.mono_nuc_peak_exists | total_primary_alignments | -0.577 | 1.16e-08 |
| stanford,t66-lung | align.frag_len_stat.mono_nuc_peak_exists | align.nodup_samstat.total_reads | -0.565 | 3.18e-08 |
| stanford,t66-lung | align.frag_len_stat.mono_nuc_peak_exists | align_enrich.tss_enrich.tss_enrich | 0.577 | 1.02e-08 |
| stanford,t68-liver | align.dup.pct_duplicate_reads | Lib_DNA_conc | -0.584 | 5.67e-05 |
| stanford,t68-liver | align.dup.pct_duplicate_reads | align.samstat.pct_mapped_reads | 0.615 | 2.30e-06 |
| stanford,t68-liver | pct_chrM | align.samstat.pct_mapped_reads | 0.702 | 1.36e-08 |
| stanford,t68-liver | pct_chrM | align.dup.pct_duplicate_reads | 0.773 | 2.18e-09 |
| stanford,t68-liver | align.nodup_samstat.total_reads | total_primary_alignments | 0.846 | 2.76e-28 |
| stanford,t68-liver | align_enrich.tss_enrich.tss_enrich | peak_enrich.frac_reads_in_peaks.macs2.frip | 0.804 | 1.39e-13 |
| stanford,t68-liver | registration.sex | pct_chrM | -0.678 | 9.66e-08 |
| stanford,t68-liver | terminal.weight.bw | pct_chrM | -0.564 | 3.48e-07 |
| stanford,t69-brown-adipose | pct_chrM | align.dup.pct_duplicate_reads | 0.887 | 7.45e-21 |
| stanford,t69-brown-adipose | align.nodup_samstat.total_reads | replication.num_peaks.num_peaks | 0.542 | 6.22e-05 |
| stanford,t69-brown-adipose | align.nodup_samstat.total_reads | total_primary_alignments | 0.933 | 4.47e-31 |
| stanford,t69-brown-adipose | align_enrich.tss_enrich.tss_enrich | peak_enrich.frac_reads_in_peaks.macs2.frip | 0.886 | 1.20e-18 |
| mssm,t70-white-adipose | peak_enrich.frac_reads_in_peaks.macs2.frip | Sample_batch | 0.433 | 1.71e-06 |
| mssm,t70-white-adipose | replication.num_peaks.num_peaks | peak_enrich.frac_reads_in_peaks.macs2.frip | -0.355 | 2.09e-08 |
| mssm,t70-white-adipose | align.dup.pct_duplicate_reads | Sample_batch | 0.268 | 4.74e-07 |
| mssm,t70-white-adipose | align.dup.pct_duplicate_reads | peak_enrich.frac_reads_in_peaks.macs2.frip | 0.690 | 4.68e-14 |
| mssm,t70-white-adipose | align.dup.pct_duplicate_reads | replication.num_peaks.num_peaks | -0.366 | 1.13e-05 |
| mssm,t70-white-adipose | pct_chrM | Sample_batch | 0.475 | 1.63e-08 |
| mssm,t70-white-adipose | pct_chrM | peak_enrich.frac_reads_in_peaks.macs2.frip | 0.799 | 4.01e-19 |
| mssm,t70-white-adipose | pct_chrM | replication.num_peaks.num_peaks | -0.387 | 2.11e-06 |
| mssm,t70-white-adipose | pct_chrM | align.dup.pct_duplicate_reads | 0.879 | 5.06e-27 |
| mssm,t70-white-adipose | align.nodup_samstat.total_reads | total_primary_alignments | 0.960 | 3.48e-52 |
| mssm,t70-white-adipose | align_enrich.tss_enrich.tss_enrich | Sample_batch | 0.497 | 6.36e-08 |
| mssm,t70-white-adipose | align_enrich.tss_enrich.tss_enrich | peak_enrich.frac_reads_in_peaks.macs2.frip | 0.905 | 4.47e-28 |
| mssm,t70-white-adipose | align_enrich.tss_enrich.tss_enrich | replication.num_peaks.num_peaks | -0.370 | 1.66e-06 |
| mssm,t70-white-adipose | align_enrich.tss_enrich.tss_enrich | align.dup.pct_duplicate_reads | 0.640 | 1.03e-13 |
| mssm,t70-white-adipose | align_enrich.tss_enrich.tss_enrich | pct_chrM | 0.757 | 3.40e-18 |
| mssm,t70-white-adipose | registration.sex | Sample_batch | 0.424 | 4.33e-08 |
| mssm,t70-white-adipose | registration.sex | peak_enrich.frac_reads_in_peaks.macs2.frip | 0.636 | 4.03e-05 |
| mssm,t70-white-adipose | registration.sex | align.dup.pct_duplicate_reads | 0.700 | 1.96e-07 |
| mssm,t70-white-adipose | registration.sex | pct_chrM | 0.777 | 2.80e-07 |
| mssm,t70-white-adipose | registration.sex | align_enrich.tss_enrich.tss_enrich | 0.528 | 2.93e-05 |
| mssm,t70-white-adipose | key.sacrificetime | Sample_batch | -0.154 | 5.57e-16 |
| mssm,t70-white-adipose | terminal.weight.bw | Sample_batch | 0.365 | 1.36e-15 |
| mssm,t70-white-adipose | terminal.weight.bw | peak_enrich.frac_reads_in_peaks.macs2.frip | 0.492 | 8.94e-05 |
| mssm,t70-white-adipose | terminal.weight.bw | align.dup.pct_duplicate_reads | 0.609 | 2.56e-07 |
| mssm,t70-white-adipose | terminal.weight.bw | pct_chrM | 0.647 | 5.15e-07 |
| stanford,t70-white-adipose | Lib_DNA_conc | Sample_batch | -0.566 | 1.71e-05 |
| stanford,t70-white-adipose | align.dup.pct_duplicate_reads | replication.num_peaks.num_peaks | 0.339 | 5.51e-07 |
| stanford,t70-white-adipose | total_primary_alignments | replication.num_peaks.num_peaks | 0.340 | 2.58e-06 |
| stanford,t70-white-adipose | align.nodup_samstat.total_reads | replication.num_peaks.num_peaks | 0.340 | 1.04e-05 |
| stanford,t70-white-adipose | align.nodup_samstat.total_reads | total_primary_alignments | 0.994 | 1.10e-54 |
| stanford,t70-white-adipose | align_enrich.tss_enrich.tss_enrich | peak_enrich.frac_reads_in_peaks.macs2.frip | 0.696 | 2.27e-07 |
| stanford,t70-white-adipose | align_enrich.tss_enrich.tss_enrich | pct_chrM | 0.726 | 1.05e-10 |
In this analysis, outliers are flagged by examining the boxplot of each PC, extending its whiskers to three times the inter-quantile range away from the boxplot. Samples outside this range are then flagged.
Note that samples are flagged using an automatic analysis of the principal components. Such analyses may flag samples because of true biological effects and thus further examination is required before determining if flagged samples represent low quality samples.
pca_outliers_report = c()
for(currname in names(norm_atacseq_data)){
curr_pcax = curr_pcax = norm_atacseq_data[[currname]][["pca"]][["pcax"]]
# Univariate: use IQRs
pca_outliers = c()
for(j in 1:num_pcs_for_outlier_analysis){
outlier_values <- boxplot.stats(curr_pcax[,j],coef = 3)$out
for(outlier in names(outlier_values)){
pca_outliers_report = rbind(pca_outliers_report,
c(currname,paste("PC",j,sep=""),outlier,
format(outlier_values[outlier],digits=5))
)
if(!is.element(outlier,names(pca_outliers))){
pca_outliers[outlier] = outlier_values[outlier]
}
}
}
# Plot the outliers
if(length(pca_outliers)>0){
# print(length(kNN_outliers))
df = data.frame(curr_pcax,
outliers = rownames(curr_pcax) %in% names(pca_outliers))
col = rep("black",nrow(df))
col[df$outliers] = "green"
plot(df$PC1,df$PC2,pch = as.numeric(df$outliers),col=col,lwd=2,cex=1,
main = paste(currname,"\nflagged outliers"),
xlab = "PC1",ylab="PC2")
plot(df$PC3,df$PC4,pch = as.numeric(df$outliers),col=col,lwd=2,cex=1,
main = paste(currname,"\nflagged outliers"),
xlab = "PC3",ylab="PC4")
}
}
if(!is.null(dim(pca_outliers_report))){
colnames(pca_outliers_report) = c("dataset","PC","sample","score")
}
We verify that the sex of a sample can be predicted from the percent of chromosome Y and percent of chromosome X reads. This is done automatically by training a logistic regression classifier, but we also plot the 2D plot for each tissue.
# sex checks - using simple logistic regression classifier (LOO-CV)
sex_read_info = cbind(atacseq_meta$pct_chrX,atacseq_meta$pct_chrY)
rownames(sex_read_info) = rownames(atacseq_meta)
sex_check_outliers = c()
for(currname in names(norm_atacseq_data)){
# 1 is female
curr_sex = norm_atacseq_data[[currname]]$dmaqc_meta$registration.sex
read_info = sex_read_info[
rownames(norm_atacseq_data[[currname]]$dmaqc_meta),]
df = data.frame(sex = as.character(curr_sex),
pct_chrX = read_info[,1],
pct_chrY = read_info[,2],stringsAsFactors = F)
df$sex[df$sex=="1"] = "F"
df$sex[df$sex=="2"] = "M"
p = ggplot(df) +
geom_point(aes(x=pct_chrX, y=pct_chrY,col=sex, shape=sex)) +
ggtitle(paste0(currname," - sex check")) +
theme_classic()
plot(p)
train_control <- trainControl(method = "cv", number = nrow(df),
savePredictions = TRUE)
# train the model on training set
model <- train(sex ~ .,data = df,
trControl = train_control,
method = "glm", family=binomial(link="logit"))
# CV results
cv_res = model$pred
cv_errors = cv_res[,1] != cv_res[,2]
err_samples = rownames(df)[cv_res[cv_errors,"rowIndex"]]
for(samp in err_samples){
sex_check_outliers = rbind(sex_check_outliers,
c(currname,samp))
}
}
if(! is.null(dim(sex_check_outliers))){
colnames(sex_check_outliers) = c("dataset","sample")
}
# add the dataset name to mop outliers
sample2dataset = paste(tolower(atacseq_meta[,c("Tissue")]),
tolower(atacseq_meta[,c("GET_site")]),sep=",")
names(sample2dataset) = rownames(atacseq_meta)
all_flagged = NULL
if(length(pca_outliers_report)>0){
all_flagged = union(all_flagged,pca_outliers_report[,"sample"])
}
if(length(sex_check_outliers)>0){
all_flagged = union(all_flagged,sex_check_outliers[,"sample"])
}
all_flagged = union(all_flagged,mop_flagged_samples)
flagged_sample_report = c()
for(samp in all_flagged){
samp_dataset = sample2dataset[samp]
samp_metric = NULL
if(samp %in% mop_flagged_samples){
samp_metric = atacseq_meta[samp,"pipeline_flags"]
}
if(!is.null(dim(sex_check_outliers)) &&
samp %in% sex_check_outliers[,"sample"]){
samp_metric = c(samp_metric,"Sex-flagged")
}
if(!is.null(dim(pca_outliers_report)) &&
samp %in% pca_outliers_report[,"sample"]){
curr_pcs = pca_outliers_report[pca_outliers_report[,"sample"]==samp,"PC"]
samp_metric = c(samp_metric,curr_pcs)
}
flagged_sample_report = rbind(flagged_sample_report,
c(samp,samp_dataset,paste(samp_metric,collapse=","))
)
}
colnames(flagged_sample_report) = c("Viallabel","Dataset","Methods")
kable(flagged_sample_report,longtable=T,
caption = "Outliers detected, MOP, Sex check, and by tissue PCA data")%>%
kable_styling(font_size = 8,latex_options = c("hold_position", "repeat_header"))
| Viallabel | Dataset | Methods |
|---|---|---|
| 90406015204 | hippocampus,stanford | PC1,PC2 |
| 90423015204 | hippocampus,stanford | PC1,PC2 |
| 90239015504 | gastrocnemius,mssm | PC1 |
| 90252015504 | gastrocnemius,mssm | PC1 |
| 90265015504 | gastrocnemius,mssm | PC1 |
| 90283015504 | gastrocnemius,mssm | PC1 |
| 90420015504 | gastrocnemius,mssm | PC1 |
| 90422015504 | gastrocnemius,mssm | PC1 |
| 90576015504 | gastrocnemius,mssm | PC1,PC2 |
| 90578015504 | gastrocnemius,mssm | PC1 |
| 90222015505 | gastrocnemius,stanford | PC2,PC3 |
| 90567015505 | gastrocnemius,stanford | PC2 |
| 90585015505 | gastrocnemius,stanford | NoMonoDiNucPeaks,PC3 |
| 90267015802 | heart,stanford | NumFiltReads<20M,PC2,PC3 |
| 90567015802 | heart,stanford | NumFiltReads<20M,PC2,PC3 |
| 90259015904 | kidney,stanford | PC3 |
| 90259016602 | lung,stanford | PC2,PC3 |
| 90439016602 | lung,stanford | NoMonoDiNucPeaks,PC2 |
| 90265016602 | lung,stanford | NoMonoDiNucPeaks,PC3 |
| 90267016805 | liver,stanford | NumFiltReads<20M,PC2 |
| 90567016805 | liver,stanford | NumFiltReads<20M,PC2 |
| 90266016904 | brown adipose,stanford | PC2 |
| 90567017011 | white adipose,stanford | NumFiltReads<20M,MacsFrip<0.1,TssEnrich<4,PC3 |
| 90217017011 | white adipose,stanford | NoMonoDiNucPeaks,TssEnrich<4 |
| 90218017011 | white adipose,stanford | NoMonoDiNucPeaks,TssEnrich<4 |
| 90222017011 | white adipose,stanford | NoMonoDiNucPeaks,TssEnrich<4 |
| 90223017011 | white adipose,stanford | NoMonoDiNucPeaks,TssEnrich<4 |
| 90229015802 | heart,stanford | NoMonoDiNucPeaks |
| 90229017011 | white adipose,stanford | NoMonoDiNucPeaks |
| 90232017011 | white adipose,stanford | NoMonoDiNucPeaks |
| 90237017011 | white adipose,stanford | TssEnrich<4 |
| 90239017011 | white adipose,stanford | TssEnrich<4 |
| 90245016805 | liver,stanford | NumFiltReads<20M |
| 90245017003 | white adipose,mssm | TssEnrich<4 |
| 90245017011 | white adipose,stanford | TssEnrich<4 |
| 90248016602 | lung,stanford | NoMonoDiNucPeaks |
| 90251017011 | white adipose,stanford | TssEnrich<4 |
| 90252017003 | white adipose,mssm | TssEnrich<4 |
| 90252017011 | white adipose,stanford | NoMonoDiNucPeaks,TssEnrich<4 |
| 90254017011 | white adipose,stanford | NoMonoDiNucPeaks,TssEnrich<4 |
| 90259017011 | white adipose,stanford | TssEnrich<4 |
| 90265017003 | white adipose,mssm | TssEnrich<4 |
| 90265017011 | white adipose,stanford | TssEnrich<4 |
| 90266016805 | liver,stanford | NumFiltReads<20M |
| 90267017003 | white adipose,mssm | TssEnrich<4 |
| 90281016602 | lung,stanford | NoMonoDiNucPeaks |
| 90283017003 | white adipose,mssm | NoMonoDiNucPeaks |
| 90283017011 | white adipose,stanford | NoMonoDiNucPeaks,TssEnrich<4 |
| 90289015505 | gastrocnemius,stanford | NoMonoDiNucPeaks |
| 90406017011 | white adipose,stanford | TssEnrich<4 |
| 90410017003 | white adipose,mssm | NoMonoDiNucPeaks |
| 90410017011 | white adipose,stanford | NoMonoDiNucPeaks,TssEnrich<4 |
| 90412017011 | white adipose,stanford | NoMonoDiNucPeaks,TssEnrich<4 |
| 90416017011 | white adipose,stanford | TssEnrich<4 |
| 90420016602 | lung,stanford | NoMonoDiNucPeaks |
| 90420017011 | white adipose,stanford | NoMonoDiNucPeaks,TssEnrich<4 |
| 90421016602 | lung,stanford | NoMonoDiNucPeaks |
| 90421017011 | white adipose,stanford | NoMonoDiNucPeaks,TssEnrich<4 |
| 90423017011 | white adipose,stanford | NoMonoDiNucPeaks,TssEnrich<4 |
| 90430017011 | white adipose,stanford | TssEnrich<4 |
| 90449017011 | white adipose,stanford | NoMonoDiNucPeaks,TssEnrich<4 |
| 90560017003 | white adipose,mssm | NoMonoDiNucPeaks |
| 90560017011 | white adipose,stanford | TssEnrich<4 |
| 90564017011 | white adipose,stanford | TssEnrich<4 |
| 90567017003 | white adipose,mssm | TssEnrich<4 |
| 90571017003 | white adipose,mssm | NumFiltReads<20M |
| 90576017011 | white adipose,stanford | TssEnrich<4 |
| 90578016805 | liver,stanford | NumFiltReads<20M |
| 90578017011 | white adipose,stanford | TssEnrich<4 |
| 90581017003 | white adipose,mssm | NoMonoDiNucPeaks |
| 90581017011 | white adipose,stanford | TssEnrich<4 |
| 90585016805 | liver,stanford | NumFiltReads<20M |
| 90585017011 | white adipose,stanford | TssEnrich<4 |
| 90587017011 | white adipose,stanford | TssEnrich<4 |
for(dataset in names(norm_atacseq_data)){
x = norm_atacseq_data[[dataset]]$norm_counts
pheno_data = atacseq_data$pheno[[dataset]]
x_bids = pheno_data[colnames(x),"bid"]
x_pids = pheno_data[colnames(x),"pid"]
newx = cbind(rownames(x),x)
rownames(newx) = NULL
colnames(newx)[1] = "feature_ID"
newx = rbind(c("bid",x_bids),newx)
newx = rbind(c("pid",x_pids),newx)
newx = rbind(c("viallabel",colnames(x)),newx)
colnames(newx) = NULL
# a simple sanity check
m = newx[-c(1:3),-1]
mode(m) = "numeric"
if(!all(abs(m-x)< 1e-05)){
print("Error in parsing the matrix, breaking")
break
}
# save the output to the target bucket
splits = unname(unlist(strsplit(dataset, ',')))
file_name = sprintf("pass1b-06_epigen-atac-seq_%s_%s_quant-norm.tsv",
splits[2],
splits[1])
write_to_bucket(newx, file_name, output_bucket, tmpdir='.', sep='\t', col.names=F, row.names=F, quote=F)
}